Libraries needed:
library(tidyverse)
library(gghighlight)
library(patchwork)
Data used in this work is House Sale Prices in Ames, Iowa, USA from 2006-2010, downloaded from Kaggle : https://www.kaggle.com/c/house-prices-advanced-regression-techniques .
# Load the Kaggle training set from the working directory and convert it to a
# tibble. `as_tibble()` replaces `tbl_df()`, which is deprecated since
# dplyr 1.0.0.
df_ori <- read.csv("train.csv")
df_ori <- as_tibble(df_ori)
df_ori
All columns in dataset :
# Print every column with its type and first few values.
df_ori %>% glimpse()
## Rows: 1,460
## Columns: 81
## $ Id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ MSSubClass <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,~
## $ MSZoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R~
## $ LotFrontage <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, ~
## $ LotArea <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612~
## $ Street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", ~
## $ Alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ LotShape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", ~
## $ LandContour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", ~
## $ Utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu~
## $ LotConfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I~
## $ LandSlope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", ~
## $ Neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "~
## $ Condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",~
## $ Condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", ~
## $ BldgType <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", ~
## $ HouseStyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi~
## $ OverallQual <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,~
## $ OverallCond <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,~
## $ YearBuilt <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19~
## $ YearRemodAdd <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19~
## $ RoofStyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G~
## $ RoofMatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "~
## $ Exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "~
## $ Exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "~
## $ MasVnrType <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",~
## $ MasVnrArea <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, ~
## $ ExterQual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T~
## $ ExterCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T~
## $ Foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "~
## $ BsmtQual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T~
## $ BsmtCond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T~
## $ BsmtExposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N~
## $ BsmtFinType1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", ~
## $ BsmtFinSF1 <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99~
## $ BsmtFinType2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", ~
## $ BsmtFinSF2 <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BsmtUnfSF <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17~
## $ TotalBsmtSF <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10~
## $ Heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", ~
## $ HeatingQC <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E~
## $ CentralAir <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~
## $ Electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S~
## $ X1stFlrSF <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, ~
## $ X2ndFlrSF <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,~
## $ LowQualFinSF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ GrLivArea <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10~
## $ BsmtFullBath <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,~
## $ BsmtHalfBath <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ FullBath <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,~
## $ HalfBath <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
## $ BedroomAbvGr <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,~
## $ KitchenAbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,~
## $ KitchenQual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T~
## $ TotRmsAbvGrd <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6~
## $ Functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", ~
## $ Fireplaces <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,~
## $ FireplaceQu <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", ~
## $ GarageType <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch~
## $ GarageYrBlt <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19~
## $ GarageFinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", ~
## $ GarageCars <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,~
## $ GarageArea <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7~
## $ GarageQual <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G~
## $ GarageCond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T~
## $ PavedDrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~
## $ WoodDeckSF <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160~
## $ OpenPorchSF <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,~
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, ~
## $ X3SsnPorch <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ ScreenPorch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, ~
## $ PoolArea <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ PoolQC <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ Fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,~
## $ MiscFeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, ~
## $ MiscVal <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,~
## $ MoSold <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10~
## $ YrSold <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20~
## $ SaleType <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W~
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm~
## $ SalePrice <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, ~
In this section, we will analyze the count of each type of foundation used for the houses. The corresponding variable is Foundation, whose values are counted below:
# Bar chart of the number of houses per foundation type. Bars (and their
# labels) with a count above 600 are highlighted by gghighlight.
df_ori %>%
  count(Foundation, name = "nbrow") %>%
  ggplot(aes(x = Foundation, y = nbrow)) +
  ylim(0, 850) +
  geom_bar(stat = "identity", fill = "#00AFBB") +
  geom_label(aes(label = nbrow)) +
  labs(
    title = "Count of Foundation Variable",
    subtitle = "Cinder block and poured concrete are the most common foundation",
    x = "Foundation",
    y = "Count"
  ) +
  theme_bw() +
  gghighlight(
    max(nbrow) > 600,
    unhighlighted_params = aes(fill = NULL, color = NULL)
  )
By visualizing the counts of the Foundation variable, cinder block and poured concrete are easily recognized as the most common foundations used for houses in Ames, Iowa (2006-2010). Other types of foundation such as slab, stone, and wood are rare, while brick and tile foundations are still used for more than a hundred houses.
The SalePrice variable is analyzed in this section and plotted using a histogram. Since it’s very likely that the histogram will be positively skewed, a log transformation will be applied to make the distribution more ‘normal’. Then, the histogram and the boxplot before and after this transformation will be visualized side by side to check whether the transformation works.
# Add LogSalePrice, the natural log of SalePrice.
df_ori <- df_ori %>%
  mutate(LogSalePrice = log(SalePrice))

# Density-scaled histogram of the sale price after the log transformation,
# overlaid with a density curve and a dashed line at the mean.
# `after_stat(density)` replaces the deprecated `..density..` notation.
p1 <- df_ori %>%
  ggplot(aes(x = LogSalePrice)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "white", color = "black", size = 0.5
  ) +
  geom_density(
    aes(y = after_stat(density)),
    fill = "#E69F00", alpha = 0.2, size = 0.7
  ) +
  theme_minimal() +
  geom_vline(
    aes(xintercept = mean(LogSalePrice)),
    color = "blue", linetype = "dashed", size = 1
  ) +
  labs(x = "log(SalePrice)")

# Same plot for the sale price before the transformation.
p2 <- df_ori %>%
  ggplot(aes(x = SalePrice)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "white", color = "black", size = 0.5
  ) +
  geom_density(
    aes(y = after_stat(density)),
    fill = "#E69F00", alpha = 0.2, size = 0.7
  ) +
  theme_minimal() +
  geom_vline(
    aes(xintercept = mean(SalePrice)),
    color = "blue", linetype = "dashed", size = 1
  )

# Combine both graphs side by side (patchwork): original left, log right.
(p2 + p1) +
  plot_annotation(
    title = "The Histogram of SalePrice Variable",
    subtitle = "Log transformation successfully transforms the positive-skewed SalePrice distribution into normal distribution"
  )
One thing that immediately comes to my mind when I see this dataset is the relationship between Year Sold (YrSold) and Sale Price (LogSalePrice). Please note that even though year is numeric, we can’t say that year is a continuous variable, since (2006+2007)/2 or (2006*2007) do not have any meaning. That’s why I think it’s more reasonable to classify year as a categorical variable.
# Jittered scatter of log sale price by year sold with a smoothed trend line.
# The smoothing formula uses s(), which only GAM understands, so the method is
# named explicitly instead of relying on geom_smooth()'s size-based automatic
# choice (loess for small data would fail on this formula). k = 5 matches the
# five distinct sale years.
df_ori %>%
  ggplot(aes(x = YrSold, y = LogSalePrice)) +
  geom_jitter(color = "gray") +
  geom_smooth(
    method = "gam",
    formula = y ~ s(x, bs = "cs", k = 5),
    color = "black"
  ) +
  theme_minimal() +
  labs(
    title = "Sale Price vs Year Sold",
    subtitle = "Houses price tended to be stagnant during 2006-2010 period",
    y = "log(Sale Price)",
    x = "Year Sold"
  )
It’s very clear that prices were stagnant during 2006-2010. This happened because, during these years, the US was affected by a phenomenon called the Great Recession. These years were surely a nightmare for all real estate investors because prices didn’t rise year after year.
Another interesting thing to analyze is the relationship between neighborhood and sale price, which will give us information about where the elite areas are, etc. According to Google, a neighborhood is a local geographic area with similar characteristics. It may be referred to by name (e.g., Brooklyn Heights, Palisades) and have designated boundaries. Common ways to plot this kind of data are boxplots or facets. But in this exercise, I want to try the ridgeline plot.
library(ggridges) #library to plot the ridgline plot
# Ridgeline plot: one sale-price density (in thousands of dollars) per
# neighborhood; the fill legend is hidden because the y axis already names
# each neighborhood.
df_ori %>%
  ggplot(aes(x = SalePrice / 1000, y = Neighborhood, fill = Neighborhood)) +
  geom_density_ridges() +
  theme_minimal() +
  theme_ridges(font_size = 11, center_axis_labels = TRUE) +
  theme(legend.position = "none") +
  labs(
    title = "Sale Price for Each Neighborhood",
    subtitle = "NRidgHt, NoRidge, StoneBr have the most expensive and diverse prices",
    x = "Sale Prices in 1000$",
    y = "Neighborhood"
  )
From above graph, we can interpret that StoneBr, NoRidge and NridgHt are considered neighborhoods with the most expensive and diverse house prices meanwhile MeadowV is the cheapest among all neighborhoods.
Cheapest median :
# Five neighborhoods with the lowest median sale price, cheapest first.
df_ori %>%
  group_by(Neighborhood) %>%
  summarise(median = median(SalePrice)) %>%
  arrange(median) %>%
  slice(seq_len(5))
Highest median:
# Five neighborhoods with the highest median sale price, most expensive first.
df_ori %>%
  group_by(Neighborhood) %>%
  summarise(median = median(SalePrice)) %>%
  arrange(desc(median)) %>%
  slice(seq_len(5))
One more interesting thing to analyze is which area variables have a strong correlation with the target variable, SalePrice. First, let’s make a new data frame that contains every variable with ‘Area’ in its name, plus SalePrice.
# Keep every column whose name contains "Area", followed by SalePrice, then
# print the data and its Pearson correlation matrix.
df_new <- df_ori %>%
  select(contains("Area"), SalePrice)
df_new
cor(df_new)
## LotArea MasVnrArea GrLivArea GarageArea PoolArea SalePrice
## LotArea 1.00000000 NA 0.2631162 0.18040276 0.07767239 0.26384335
## MasVnrArea NA 1 NA NA NA NA
## GrLivArea 0.26311617 NA 1.0000000 0.46899748 0.17020534 0.70862448
## GarageArea 0.18040276 NA 0.4689975 1.00000000 0.06104727 0.62343144
## PoolArea 0.07767239 NA 0.1702053 0.06104727 1.00000000 0.09240355
## SalePrice 0.26384335 NA 0.7086245 0.62343144 0.09240355 1.00000000
Remove MasVnrArea because its Pearson correlations are NA (the column contains missing values), and add a few more columns that hold the log-transformed variables.
# Drop MasVnrArea, append the natural log of every remaining column, and
# recompute the correlation matrix.
df_new <- df_new %>%
  select(-MasVnrArea) %>%
  mutate(
    LogSalePrice = log(SalePrice),
    LogLotArea = log(LotArea),
    LogGrLivArea = log(GrLivArea),
    LogPoolArea = log(PoolArea),
    LogGarageArea = log(GarageArea)
  )
cor(df_new)
## LotArea GrLivArea GarageArea PoolArea SalePrice
## LotArea 1.00000000 0.2631162 0.18040276 0.07767239 0.26384335
## GrLivArea 0.26311617 1.0000000 0.46899748 0.17020534 0.70862448
## GarageArea 0.18040276 0.4689975 1.00000000 0.06104727 0.62343144
## PoolArea 0.07767239 0.1702053 0.06104727 1.00000000 0.09240355
## SalePrice 0.26384335 0.7086245 0.62343144 0.09240355 1.00000000
## LogSalePrice 0.25731989 0.7009267 0.65088756 0.06979781 0.94837373
## LogLotArea 0.69794532 0.3945774 0.32204548 0.09179094 0.38852027
## LogGrLivArea 0.24535747 0.9663720 0.47616021 0.10974441 0.69511807
## LogPoolArea NaN NaN NaN NaN NaN
## LogGarageArea NaN NaN NaN NaN NaN
## LogSalePrice LogLotArea LogGrLivArea LogPoolArea LogGarageArea
## LotArea 0.25731989 0.69794532 0.2453575 NaN NaN
## GrLivArea 0.70092665 0.39457745 0.9663720 NaN NaN
## GarageArea 0.65088756 0.32204548 0.4761602 NaN NaN
## PoolArea 0.06979781 0.09179094 0.1097444 NaN NaN
## SalePrice 0.94837373 0.38852027 0.6951181 NaN NaN
## LogSalePrice 1.00000000 0.39991774 0.7302549 NaN NaN
## LogLotArea 0.39991774 1.00000000 0.3854352 NaN NaN
## LogGrLivArea 0.73025485 0.38543520 1.0000000 NaN NaN
## LogPoolArea NaN NaN NaN 1 NaN
## LogGarageArea NaN NaN NaN NaN 1
We get NaN values for LogPoolArea and LogGarageArea because not all houses have a garage or a pool: an area of 0 gives log(0) = -Inf, which makes the correlation undefined.
# Remove the two log columns whose correlations came out as NaN.
df_new <- df_new %>%
  select(-c(LogPoolArea, LogGarageArea))
library(reshape2)
# Blank out one triangle of a (correlation) matrix.
#
# cormat: a matrix.
# Returns `cormat` with every entry strictly below the diagonal set to NA.
# Note that, despite the function's name, it is the upper triangle and the
# diagonal that are kept.
get_lower_tri <- function(cormat) {
  replace(cormat, lower.tri(cormat), NA)
}
# Correlation matrix rounded to two decimals, then reduced to a single
# triangle so each variable pair appears only once.
df_cor <- df_new %>%
  cor() %>%
  round(2)
melt_df_cor <- get_lower_tri(df_cor)
Heatmap for Pearson Correlation Matrix :
# Reshape the triangular matrix to long form (Var1, Var2, value), dropping the
# blanked-out NA cells.
melt_df_cor <- melt(melt_df_cor, na.rm = TRUE)

# Heatmap of the correlations on a diverging blue-white-red scale fixed to
# [-1, 1], with each coefficient printed in its tile. `limits` is spelled out
# in full rather than relying on partial argument matching of `limit`.
melt_df_cor %>%
  ggplot(aes(Var2, Var1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)
  ) +
  geom_text(aes(label = value)) +
  labs(
    title = "Pearson Correlation Heatmap",
    subtitle = "LogSalePrice vs LogGrLivArea have the highest correlation",
    x = "",
    y = ""
  )
Based on the heatmap, we can see that all area variables are positively correlated with each other and with the sale price, which is reasonable. The highest correlation with the log sale price is found between LogSalePrice and LogGrLivArea (above-ground living area), i.e. 0.73, slightly higher than the correlation between SalePrice and GrLivArea, i.e. 0.71. This result suggests that GrLivArea (above-ground living area) is one of the strong predictors of the target variable SalePrice. Finally, let’s draw a scatter plot with a linear regression line for LogGrLivArea and LogSalePrice.
# Scatter of log living area against log sale price, points colored by
# neighborhood, with one overall linear fit. Color is mapped only in the point
# layer so the regression line is not split per neighborhood.
df_ori %>%
  ggplot(aes(x = log(GrLivArea), y = LogSalePrice)) +
  geom_point(aes(color = Neighborhood)) +
  geom_smooth(method = "lm", color = "black") +
  theme_minimal() +
  labs(
    title = "LogGrLivArea vs LogSalePrice",
    subtitle = "The linear regression line fits the scatter point nicely",
    x = "LogGrLivArea",
    y = "LogSalePrice"
  )
One of the milestones I set for myself when I started learning data science was to be able to create geospatial map visualizations. Hence, in this section, I want to create a map visualization of the houses in Ames from this data. Unfortunately, I face two big problems:
To tackle these two problems (which I found really difficult to solve), the dataset used here is the Ames Housing data from the AmesHousing library itself, not the train.csv downloaded from Kaggle.
library(AmesHousing) #For extract latitude and longitude information
# Build the processed Ames data set from the AmesHousing package and keep it
# as a tibble. `as_tibble()` replaces `tbl_df()`, which is deprecated since
# dplyr 1.0.0.
df_1 <- make_ames()
ames_df <- as_tibble(df_1)
ames_df
library(jpeg)
library(grid)
# Absolute, machine-specific location of the JPG image of the Ames, Iowa map
# (taken from openstreetmap.org).
map_path <- "D:\\Learning_r\\Cleaning_dATA\\Cleaning_Data_Project\\Project_Cleaning\\HW_Day12_Indra\\map_ames.jpg"
# Read the map image.
googlemap <- readJPEG(map_path)
Transform the full neighborhood names in the ames_df dataset into the abbreviations used in the Kaggle data.
# Translate full neighborhood names (e.g. "North_Ames") into their abbreviated
# form (e.g. "NAmes").
#
# x: character or factor value(s) holding full neighborhood names.
# Returns an unnamed character vector the same length as `x`. Names that are
# not in the lookup table (and NA inputs) map to NA_character_ rather than
# NULL, so the result stays an atomic vector even when this function is
# wrapped with Vectorize().
Neigh <- function(x) {
  lookup <- c(
    North_Ames = "NAmes",
    Gilbert = "Gilbert",
    Stone_Brook = "StoneBr",
    Northwest_Ames = "NWAmes",
    Somerset = "Somerst",
    Briardale = "BrDale",
    Northpark_Villa = "NPkVill",
    Northridge_Heights = "NridgHt",
    Bloomington_Heights = "Blmngtn",
    Northridge = "NoRidge",
    Sawyer_West = "SawyerW",
    Sawyer = "Sawyer",
    Greens = "Greens",
    Brookside = "BrkSide",
    Old_Town = "OldTown",
    Iowa_DOT_and_Rail_Road = "IDOTRR",
    Clear_Creek = "ClearCr",
    South_and_West_of_Iowa_State_University = "SWISU",
    Edwards = "Edwards",
    College_Creek = "CollgCr",
    Crawford = "Crawfor",
    Blueste = "Blueste",
    Mitchell = "Mitchel",
    Timberland = "Timber",
    Meadow_Village = "MeadowV",
    Veenker = "Veenker",
    Green_Hills = "GreenH",
    Landmark = "Landm"
  )
  # as.character() turns factor input into its labels before the lookup.
  unname(lookup[as.character(x)])
}
# Apply the name translation element-wise over the Neighborhood column.
Neigh <- Vectorize(Neigh)
ames_df <- ames_df %>%
  mutate(Neigh_Initial = Neigh(Neighborhood))

# One row per neighborhood holding the mean coordinates of its houses.
# Missing coordinates are ignored so that a single NA cannot turn a whole
# neighborhood's position into NA; the map plots filter out missing Latitude
# in the same spirit.
ames_df_locate <- ames_df %>%
  group_by(Neigh_Initial) %>%
  summarise(
    Latitude = mean(Latitude, na.rm = TRUE),
    Longitude = mean(Longitude, na.rm = TRUE)
  )
ames_df_locate
I create this new dataframe in order to label the 28 neighborhoods (not 2930 labels, but only 28).
# Houses plotted by longitude/latitude over the map image, colored by
# neighborhood, with one label per neighborhood at its mean position.
ames_df %>%
  filter(!is.na(Latitude)) %>%
  ggplot(aes(x = Longitude, y = Latitude, color = Neigh_Initial)) +
  # Stretch the map image over the whole panel as a background.
  annotation_custom(
    rasterGrob(googlemap, width = unit(1, "npc"), height = unit(1, "npc")),
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf
  ) +
  scale_y_continuous(limits = c(41.9700, 42.0800)) +
  scale_x_continuous(limits = c(-93.7100, -93.5600)) +
  geom_point() +
  # The label is mapped from the layer's own data instead of being passed as
  # an external vector, so labels stay aligned with their rows even if some
  # rows are dropped (e.g. by the axis limits).
  geom_label(
    data = ames_df_locate,
    aes(label = Neigh_Initial),
    alpha = 0.8,
    size = 3
  ) +
  labs(title = "Map of Neighborhoods in Ames,Iowa")
And this is the map of Neighborhoods in Ames, Iowa. Next, we will plot the sale prices based on this map
# Houses plotted by longitude/latitude over the map image, colored by sale
# price (in thousands) split into six bins with roughly equal house counts.
ames_df %>%
  filter(!is.na(Latitude)) %>%
  mutate(price_band = cut_number(Sale_Price / 1000, 6)) %>%
  ggplot(aes(x = Longitude, y = Latitude, color = price_band)) +
  # Stretch the map image over the whole panel as a background.
  annotation_custom(
    rasterGrob(googlemap, width = unit(1, "npc"), height = unit(1, "npc")),
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf
  ) +
  scale_y_continuous(limits = c(41.9700, 42.0800)) +
  scale_x_continuous(limits = c(-93.7100, -93.5600)) +
  geom_point() +
  labs(
    title = "House Prices by Location",
    subtitle = "Highest prices found in NridgHt, NoRidge, and StoneBr"
  ) +
  scale_color_discrete(name = "Price in $1000")
The conclusion we get from this map visualization is similar to the earlier one, i.e. the neighborhoods with the highest prices are NridgHt, NoRidge, StoneBr, Timber, and Somerst, while the lowest prices are found in MeadowV, IDOTRR, BrDale, OldTown, and Edwards.
P.S.: maybe in the next work I will adjust the neighborhood labels a little so that they are easier to read.
Credit to these websites, which helped me by giving insights about this data and ggplot2, especially the procedures for creating geospatial visualizations: